---
title: "Making Us Sick? Effects of Partisan Polarization on Health"
author: "Costas Panagopoulos, Daniel P. Aldrich,Dan Kim, David Hummel, Tim Fraser"
date: "April 29, 2021"
output: html_notebook
---

This RMarkdown document outlines the code necessary to compare our survey results with county level results from the BRFSS and develop a set of models for the effects of social capital and polarization on health.

# 0. Packages

```{r, message=FALSE, warning = FALSE}
library(tidyverse)
library(Zelig)
select <- dplyr::select
dir.create("viz")
```

# 1. Build dataset

First, import key variables for analysis from the survey dataset.
This dataset was compiled and processed in another study.
For replication code for the compilation of the original study, see the following dataset on the Harvard Dataverse: https://doi.org/10.7910/DVN/LQVAAR

```{r, message = FALSE, warning = FALSE, eval = FALSE}
library(tidyverse)
# Grab from original survey document, which we do not distribute
read_rds("dataset.rds") %>%
  select(id, date,
         state, zip, 
         # Outcomes
         days_poor_physical_health, days_poor_mental_health, 
         bmi, bmi_2015, 
         # Health
         high_risk_from_smoking, risk_from_smoking, age,
         
         # Polarization
         self, self_2016, 
         diff_self_state_voter_avg, diff_self_state_elect_avg,
         diff_self_us_voter_avg, diff_self_us_elect_avg, 
         partisan_gap_state_voter, partisan_gap_state_elect, 
         partisan_gap_us_voter, partisan_gap_us_elect,
         
         party_7,
         party_3,
         # demographics
         race, marriage, income, gender, health_insurance, education,
         employment, religion, immigrant_generation, hispanic,
         # extra demographics variables
         female, poverty, bachelor, 
         labor_force, marriage2, uninsured) %>%
  saveRDS("dataset_making_us_sick.rds")
```

# 2. Validation

First, let's check what our survey looks like. Is it demographically representative of the US? (yes!)

## Representativeness

```{r}
library(tidyverse)
options(stringsAsFactors = FALSE)

dat <- read_rds("dataset_making_us_sick.rds")

# We're going to use population level statistics 
# from the Census QuickFacts Site, as well as a few others,
# to record the percentage of the population that is X group for each of several different classifications.

# Quick Facts:
#https://www.census.gov/quickfacts/fact/table/US/PST045219
#https://www.cdc.gov/tobacco/data_statistics/fact_sheets/adult_data/cig_smoking/index.htm#:~:text=In%202019%2C%20nearly%2014%20of,with%20a%20smoking%2Drelated%20disease.

# https://www.census.gov/data/tables/2018/demo/families/cps-2018.html
# https://www2.census.gov/programs-surveys/demo/tables/families/2018/cps-2018/taba1-all.xls

pop <- bind_rows(
  data.frame(
    group = c("Yes", "No"),
    freq = c(0.492, 0.508),
    type = "Female"),
  
  data.frame(
    group = c("White", "Black", "Hispanic", "Asian", "Other"),
    freq = c(.604, .134, .183, .059, 0.02),
    type = "Race (Qualtrics)"),
  
  data.frame(
    group = c("Yes", "No"),
    freq =  c(0.629, .371),
    type = "In Labor Force"),
  
  data.frame(
    group = c("Yes", "No"),
    freq =  c(.10, .90),
    type = "Uninsured"),
  
  data.frame(
    group = c("Yes", "No"),
    freq = c(.118, .882),
    type = "Income\nUnder $10,000"),
  
  data.frame(
    group = c("Yes", "No"),
    freq =  c(.315, .685),
    type = "Completed\nBachelor's Degree"),
  
  data.frame(
    group = c("Yes", "No"),
    freq = c(.14, 0.86),
    type = "At Risk\nfrom Smoking"),
  
  data.frame(
    group = c("Married","Never married","Divorced",
               #"A member of an unmarried couple", (added to never married)
               "Separated","Widowed"),
  freq = c((0.486+0.015), 0.323, 0.099, 0.019, 0.058),
  type = "Marital Status"))

# Now compare those population level stats with the survey
compare <- dat %>%
  # For the following variables
  select(id, "female", "race", "poverty", "bachelor", 
         "labor_force", "marriage2", risk_from_smoking = risk_from_smoking, "uninsured") %>%
  # pivot longer for tidy calculation
  pivot_longer(cols = -c(id), names_to = "type", values_to = "group") %>%
  # Calculate how many are in each group
  group_by(type, group) %>%
  summarize(freq = n()) %>%
  # now turn that into a frequency
  ungroup() %>%
  group_by(type) %>%
  mutate(freq = round(freq / sum(freq, na.rm = TRUE), 3)) %>%
  ungroup() %>%
  # bind-in population level status
  bind_rows(pop, .id = "sample") %>%
  mutate(sample = sample %>% recode_factor(
    "1" = "Survey Sample",
    "2" = "US Population")) %>%
    # And update labels
  mutate(type = type %>% dplyr::recode_factor(
    "bachelor" = "Completed\nBachelor's Degree",
    "female" = "Female",
    "race" = "Race (Qualtrics)",
    "labor_force" = "In Labor Force",
    "poverty" = "Income\nUnder $10,000",
    "marriage2" = "Marital Status",
    "risk_from_smoking" = "At Risk\nfrom Smoking",
    "uninsured" = "Uninsured"),
    group = group %>% dplyr::recode("aa" = "Black",
                                    "white" = "White",
                                    "hispanic" = "Hispanic",
                                    "asian" = "Asian",
                                    "other" = "Other",
                                    "yes" = "Yes",
                                    "no" = "No",
                                    "in labor force" = "Yes",
                                    "not in labor force" = "No")) %>%
  mutate(group = factor(group, 
                        levels = c( "Yes", "No",
                        "White", "Hispanic", "Black",
                        "Asian", "Other", 
                        "Married", "Never married",
                        "Divorced", "Widowed", "Separated") %>% rev())) %>%
  mutate(myorder = as.numeric(type)) 

compare %>%
  ggplot(mapping = aes(x = group, y = freq*100,
                       color = sample,
                       fill = sample,
                       label = round(freq*100, 2))) +
  geom_col(position = "dodge", color = "white") +
  geom_text(position = position_dodge(width = 1), hjust = -0.1, size = 2.5) +
  facet_wrap(~reorder(type, myorder), scales = "free_y",ncol = 4) +
  theme_classic(base_size = 12) +
  scale_y_continuous(breaks = c(0, 25, 50, 75, 100)) +
  theme(legend.position = "top", legend.box = "vertical",
        axis.text = element_text(angle = 0.180),
        panel.border = element_rect(color = "black", fill = NA)) +
  labs(x = "Response", y = "% Respondents", fill = "Type") +
  scale_fill_grey() +
  scale_color_grey(start = 0, end = 0.4) +
  ylim(0, 106) +
  coord_flip() +
  guides(color = FALSE) +
  ggsave("viz/fig_1.png", dpi = 500, height = 4, width = 8)



# Now, generate those traits again...

remove(compare, pop)
```



\newpage

# 3. Descriptive Analysis


## 3.1 Descriptive Statistics

First, let's generate descriptive statistics on our main outcome variables, the number of days of poor physical and mental health a person experiences per month.

```{r}
read_rds("dataset_making_us_sick.rds") %>%
  select(id, days_poor_physical_health, days_poor_mental_health) %>%
  pivot_longer(
    cols = -c(id),
    names_to = "measure",
    values_to = "value") %>%
  mutate(measure = measure %>% recode_factor(
        "days_poor_physical_health" = "Days of Poor Physical Health",
    "days_poor_mental_health" = "Days of Poor Mental Health")) %>%
  ggplot(mapping = aes(x = value, group = measure, fill = measure)) +
  geom_histogram(aes(y =..density..), color = "white", bins = 10) + 
  geom_density(col = "black", alpha = 0.5) +
#  geom_histogram(alpha = 0.75, color = "black") +
  theme_classic(base_size = 14) +
  theme(panel.border = element_rect(color = "black", fill = NA)) +
  facet_wrap(~measure) +
  labs(x = "Days of Poor Health Reported in the Last Month", y = "(%) Respondents") +
  guides(fill = FALSE) +
  scale_fill_manual(values = c("black", "darkgrey")) +
  ggsave("viz/fig_a1.png", dpi = 500, width = 6, height = 3)
```

## 3.2 Missing Data


# 4. Multiple Imputation

## Overall

```{r}
library(Amelia)

# First, use individual level survey 
dat <- read_rds("dataset.rds") %>%
  select(id, state, days_poor_physical_health, days_poor_mental_health,
         polarized_aggregate,
         race, female, somecollege,
         nevermarried, employment, uninsured, religion,
         urban, risk_from_smoking,
         "diff_self_us_voter_avg", "diff_self_state_voter_avg",
         "bmi", "party_7", "age", "income2") %>%
  mutate(freq_physical_distress = if_else(days_poor_physical_health >= 14, 1, 0),
         freq_mental_distress = if_else(days_poor_mental_health >= 14, 1, 0))  %>%
  mutate_at(vars(c("race", "female", "somecollege", "polarized_aggregate",
                  "nevermarried", "employment", "uninsured",
                  "religion", "urban", "risk_from_smoking", 
                  "freq_physical_distress", "freq_mental_distress", "state", "urban")),
            funs(as.factor(.))) %>%
  mutate(polarized_aggregate = polarized_aggregate %>% relevel(ref = "same")) %>%
  select(id, "state",
         days_poor_physical_health, days_poor_mental_health,
         freq_physical_distress, freq_mental_distress,
         # ordinal variables
         party_7,income2,
         # numeric
         bmi, age, 
         diff_self_us_voter_avg, diff_self_state_voter_avg,
         # categorical
         race, female, somecollege,
         nevermarried, employment, uninsured, religion,
         urban, risk_from_smoking, polarized_aggregate) %>%
  mutate(race = relevel(race, ref = "white")) %>%
  as.data.frame() 

# Create a set of logical bounds based on min and max
mybounds <- bind_cols(
  # Get lower bounds
  dat %>%
    summarize_at(vars(-c(id,state,party_7,income2,
                         "freq_physical_distress", "freq_mental_distress",
                         "race", "female", "somecollege",
                         "nevermarried", "employment", "uninsured", "religion",
                         "urban", "risk_from_smoking", "polarized_aggregate")),
                 funs(min(., na.rm = TRUE))) %>%
    t() %>%
    data.frame(lower.bound = .),
  # Get upper bounds
  dat %>%
    summarize_at(vars(-c(id,state,party_7,income2,
                         "freq_physical_distress", "freq_mental_distress",
                         "race", "female", "somecollege",
                         "nevermarried", "employment", "uninsured", "religion",
                         "urban", "risk_from_smoking", "polarized_aggregate")),
                 funs(max(., na.rm = TRUE))) %>%
    t() %>%
    data.frame(upper.bound = .)
) %>%
  # Add column name for easy join
  tibble::rownames_to_column(var = "term") %>%
  # join in the number of the column,
  # having already subtracted the id numbers
  left_join(by = "term",
            y = dat %>%
              names(.) %>%
              data.frame(term = .) %>%
              mutate(column.number = 1:n())) %>%
  select(term, column.number, lower.bound, upper.bound) 


# Set a seed for replication
set.seed(15567)

# Run multiple imputation sequence
library(Amelia)
dat %>% 
  amelia(m = 5, idvars = c("id", "state"), ords = c("party_7", "income2"),
         noms = c(
           "freq_physical_distress", "freq_mental_distress",
           "race", "female", "somecollege",
           "nevermarried", "employment", "uninsured", "religion",
           "urban", "risk_from_smoking", "polarized_aggregate"),
         bounds = mybounds %>% select(-term) %>% as.matrix(), 
         max.resample = 1000) %>%
  saveRDS("dataset_mi.rds")


```

## Count

Now, repeat but just for individuals who reported 1 day or more of poor health.

### Physical Health

```{r}
library(Amelia)

# First, use individual level survey 
dat <- read_rds("dataset.rds") %>%
  filter(days_poor_physical_health > 0) %>%
  select(id, state, days_poor_physical_health, days_poor_mental_health,
         polarized_aggregate,
         race, female, somecollege,
         nevermarried, employment, uninsured, religion,
         urban, risk_from_smoking,
         "diff_self_us_voter_avg", "diff_self_state_voter_avg",
         "bmi", "party_7", "age", "income2") %>%
  mutate(freq_physical_distress = if_else(days_poor_physical_health >= 14, 1, 0),
         freq_mental_distress = if_else(days_poor_mental_health >= 14, 1, 0))  %>%
  mutate_at(vars(c("race", "female", "somecollege", "polarized_aggregate",
                  "nevermarried", "employment", "uninsured",
                  "religion", "urban", "risk_from_smoking", 
                  "freq_physical_distress", "freq_mental_distress", "state", "urban")),
            funs(as.factor(.))) %>%
  mutate(polarized_aggregate = polarized_aggregate %>% relevel(ref = "same")) %>%
  select(id, "state",
         days_poor_physical_health, days_poor_mental_health,
         freq_physical_distress, freq_mental_distress,
         # ordinal variables
         party_7,income2,
         # numeric
         bmi, age, 
         diff_self_us_voter_avg, diff_self_state_voter_avg,
         # categorical
         race, female, somecollege,
         nevermarried, employment, uninsured, religion,
         urban, risk_from_smoking, polarized_aggregate) %>%
  mutate(race = relevel(race, ref = "white")) %>%
  as.data.frame() 

# Create a set of logical bounds based on min and max
mybounds <- bind_cols(
  # Get lower bounds
  dat %>%
    summarize_at(vars(-c(id,state,party_7,income2,
                         "freq_physical_distress", "freq_mental_distress",
                         "race", "female", "somecollege",
                         "nevermarried", "employment", "uninsured", "religion",
                         "urban", "risk_from_smoking", "polarized_aggregate")),
                 funs(min(., na.rm = TRUE))) %>%
    t() %>%
    data.frame(lower.bound = .),
  # Get upper bounds
  dat %>%
    summarize_at(vars(-c(id,state,party_7,income2,
                         "freq_physical_distress", "freq_mental_distress",
                         "race", "female", "somecollege",
                         "nevermarried", "employment", "uninsured", "religion",
                         "urban", "risk_from_smoking", "polarized_aggregate")),
                 funs(max(., na.rm = TRUE))) %>%
    t() %>%
    data.frame(upper.bound = .)
) %>%
  # Add column name for easy join
  tibble::rownames_to_column(var = "term") %>%
  # join in the number of the column,
  # having already subtracted the id numbers
  left_join(by = "term",
            y = dat %>%
              names(.) %>%
              data.frame(term = .) %>%
              mutate(column.number = 1:n())) %>%
  select(term, column.number, lower.bound, upper.bound) 


# Set a seed for replication
set.seed(15567)

# Run multiple imputation sequence
library(Amelia)
dat %>% 
  amelia(m = 5, idvars = c("id", "state"), ords = c("party_7", "income2"),
         noms = c(
           "freq_physical_distress", "freq_mental_distress",
           "race", "female", "somecollege",
           "nevermarried", "employment", "uninsured", "religion",
           "urban", "risk_from_smoking", "polarized_aggregate"),
         bounds = mybounds %>% select(-term) %>% as.matrix(), 
         max.resample = 1000) %>%
  saveRDS("dataset_mi_phys.rds")

```





#### Over Median

```{r}
library(Amelia)
library(tidyverse)

# First, use individual level survey 
dat <- read_rds("dataset.rds") %>%
  filter(days_poor_physical_health > 0) %>%
  filter(days_poor_physical_health >= median(days_poor_physical_health, na.rm = TRUE)) %>%
  select(id, state, days_poor_physical_health, days_poor_mental_health,
         polarized_aggregate,
         race, female, somecollege,
         nevermarried, employment, uninsured, religion,
         urban, risk_from_smoking,
         "diff_self_us_voter_avg", "diff_self_state_voter_avg",
         "bmi", "party_7", "age", "income2") %>%
  mutate_at(vars(c("race", "female", "somecollege", "polarized_aggregate",
                  "nevermarried", "employment", "uninsured",
                  "religion", "urban", "risk_from_smoking", "state", "urban")),
            funs(as.factor(.))) %>%
  mutate(polarized_aggregate = polarized_aggregate %>% relevel(ref = "same")) %>%
  select(id, "state",
         days_poor_physical_health, days_poor_mental_health,
         # ordinal variables
         party_7,income2,
         # numeric
         bmi, age, 
         diff_self_us_voter_avg, diff_self_state_voter_avg,
         # categorical
         race, female, somecollege,
         nevermarried, employment, uninsured, religion,
         urban, risk_from_smoking, polarized_aggregate) %>%
  mutate(race = relevel(race, ref = "white")) %>%
  as.data.frame() 

# Create a set of logical bounds based on min and max
mybounds <- bind_cols(
  # Get lower bounds
  dat %>%
    summarize_at(vars(-c(id,state,party_7,income2,
                         "race", "female", "somecollege",
                         "nevermarried", "employment", "uninsured", "religion",
                         "urban", "risk_from_smoking", "polarized_aggregate")),
                 funs(min(., na.rm = TRUE))) %>%
    t() %>%
    data.frame(lower.bound = .),
  # Get upper bounds
  dat %>%
    summarize_at(vars(-c(id,state,party_7,income2,
                         "race", "female", "somecollege",
                         "nevermarried", "employment", "uninsured", "religion",
                         "urban", "risk_from_smoking", "polarized_aggregate")),
                 funs(max(., na.rm = TRUE))) %>%
    t() %>%
    data.frame(upper.bound = .)
) %>%
  # Add column name for easy join
  tibble::rownames_to_column(var = "term") %>%
  # join in the number of the column,
  # having already subtracted the id numbers
  left_join(by = "term",
            y = dat %>%
              names(.) %>%
              data.frame(term = .) %>%
              mutate(column.number = 1:n())) %>%
  select(term, column.number, lower.bound, upper.bound) 


# Set a seed for replication
set.seed(15567)

# Run multiple imputation sequence
library(Amelia)
  select(id, state, contains("diff"), bmi, age, 
         party_7, income2, 
         race, female, somecollege, nevermarried,
         employment, uninsured, religion, 
         risk_from_smoking) %>%
  as.data.frame() %>%
  Amelia::amelia(m = 5, idvars = c("id", "state"), 
                 ords = c("party_7", "income2"),
                 noms = c("race", "female", "somecollege",
                          "nevermarried", "employment",
                          "uninsured", "religion",
                           "risk_from_smoking"),
                 max.resample = 1000) %>%
  saveRDS("dataset_mi_phys_over_median.rds")

```



#### Under Median

```{r}
library(Amelia)
library(tidyverse)

# First, use individual level survey 
dat <- read_rds("dataset.rds") %>%
  filter(days_poor_physical_health > 0) %>%
  filter(days_poor_physical_health < median(days_poor_physical_health, na.rm = TRUE)) %>%
  select(id, state, days_poor_physical_health, days_poor_mental_health,
         polarized_aggregate,
         race, female, somecollege,
         nevermarried, employment, uninsured, religion,
         urban, risk_from_smoking,
         "diff_self_us_voter_avg", "diff_self_state_voter_avg",
         "bmi", "party_7", "age", "income2") %>%
  mutate_at(vars(c("race", "female", "somecollege", "polarized_aggregate",
                  "nevermarried", "employment", "uninsured",
                  "religion", "urban", "risk_from_smoking", "state", "urban")),
            funs(as.factor(.))) %>%
  mutate(polarized_aggregate = polarized_aggregate %>% relevel(ref = "same")) %>%
  select(id, "state",
         days_poor_physical_health, days_poor_mental_health,
         # ordinal variables
         party_7,income2,
         # numeric
         bmi, age, 
         diff_self_us_voter_avg, diff_self_state_voter_avg,
         # categorical
         race, female, somecollege,
         nevermarried, employment, uninsured, religion,
         urban, risk_from_smoking, polarized_aggregate) %>%
  mutate(race = relevel(race, ref = "white")) %>%
  as.data.frame() 


# Create a set of logical bounds based on min and max
mybounds <- bind_cols(
  # Get lower bounds
  dat %>%
    summarize_at(vars(-c(id,state,party_7,income2,
                         "race", "female", "somecollege",
                         "nevermarried", "employment", "uninsured", "religion",
                         "urban", "risk_from_smoking", "polarized_aggregate")),
                 funs(min(., na.rm = TRUE))) %>%
    t() %>%
    data.frame(lower.bound = .),
  # Get upper bounds
  dat %>%
    summarize_at(vars(-c(id,state,party_7,income2,
                         "race", "female", "somecollege",
                         "nevermarried", "employment", "uninsured", "religion",
                         "urban", "risk_from_smoking", "polarized_aggregate")),
                 funs(max(., na.rm = TRUE))) %>%
    t() %>%
    data.frame(upper.bound = .)
) %>%
  # Add column name for easy join
  tibble::rownames_to_column(var = "term") %>%
  # join in the number of the column,
  # having already subtracted the id numbers
  left_join(by = "term",
            y = dat %>%
              names(.) %>%
              data.frame(term = .) %>%
              mutate(column.number = 1:n())) %>%
  select(term, column.number, lower.bound, upper.bound) 

# Set a seed for replication
set.seed(15567)

# Run multiple imputation sequence
library(Amelia)
library(tidyverse)
out <- dat %>% 
  select(id, state,contains("days"), contains("diff"), bmi, age, 
         party_7, income2, 
         race, female, somecollege, nevermarried,
         employment, uninsured, religion, 
         risk_from_smoking) %>%
  as.data.frame() %>%
  Amelia::amelia(m = 5, idvars = c("id", "state"), 
                 ords = c("party_7", "income2"),
                 noms = c("race", "female", "somecollege",
                          "nevermarried", "employment",
                          "uninsured", "religion",
                           "risk_from_smoking"),
                 max.resample = 1000) %>%
  saveRDS("dataset_mi_phys_under_median.rds")

```




### Mental Health

```{r}
library(Amelia)

# First, use individual level survey 
dat <- read_rds("dataset.rds") %>%
  filter(days_poor_mental_health > 0) %>%
  select(id, state, days_poor_physical_health, days_poor_mental_health,
         polarized_aggregate,
         race, female, somecollege,
         nevermarried, employment, uninsured, religion,
         urban, risk_from_smoking,
         "diff_self_us_voter_avg", "diff_self_state_voter_avg",
         "bmi", "party_7", "age", "income2") %>%
  mutate(freq_physical_distress = if_else(days_poor_physical_health >= 14, 1, 0),
         freq_mental_distress = if_else(days_poor_mental_health >= 14, 1, 0))  %>%
  mutate_at(vars(c("race", "female", "somecollege", "polarized_aggregate",
                  "nevermarried", "employment", "uninsured",
                  "religion", "urban", "risk_from_smoking", 
                  "freq_physical_distress", "freq_mental_distress", "state", "urban")),
            funs(as.factor(.))) %>%
  mutate(polarized_aggregate = polarized_aggregate %>% relevel(ref = "same")) %>%
  select(id, "state",
         days_poor_physical_health, days_poor_mental_health,
         freq_physical_distress, freq_mental_distress,
         # ordinal variables
         party_7,income2,
         # numeric
         bmi, age, 
         diff_self_us_voter_avg, diff_self_state_voter_avg,
         # categorical
         race, female, somecollege,
         nevermarried, employment, uninsured, religion,
         urban, risk_from_smoking, polarized_aggregate) %>%
  mutate(race = relevel(race, ref = "white")) %>%
  as.data.frame() 

# Create a set of logical bounds based on min and max
mybounds <- bind_cols(
  # Get lower bounds
  dat %>%
    summarize_at(vars(-c(id,state,party_7,income2,
                         "freq_physical_distress", "freq_mental_distress",
                         "race", "female", "somecollege",
                         "nevermarried", "employment", "uninsured", "religion",
                         "urban", "risk_from_smoking", "polarized_aggregate")),
                 funs(min(., na.rm = TRUE))) %>%
    t() %>%
    data.frame(lower.bound = .),
  # Get upper bounds
  dat %>%
    summarize_at(vars(-c(id,state,party_7,income2,
                         "freq_physical_distress", "freq_mental_distress",
                         "race", "female", "somecollege",
                         "nevermarried", "employment", "uninsured", "religion",
                         "urban", "risk_from_smoking", "polarized_aggregate")),
                 funs(max(., na.rm = TRUE))) %>%
    t() %>%
    data.frame(upper.bound = .)
) %>%
  # Add column name for easy join
  tibble::rownames_to_column(var = "term") %>%
  # join in the number of the column,
  # having already subtracted the id numbers
  left_join(by = "term",
            y = dat %>%
              names(.) %>%
              data.frame(term = .) %>%
              mutate(column.number = 1:n())) %>%
  select(term, column.number, lower.bound, upper.bound) 


# Set a seed for replication
set.seed(15567)

# Run multiple imputation sequence
library(Amelia)
dat %>% 
  amelia(m = 5, idvars = c("id", "state"), ords = c("party_7", "income2"),
         noms = c(
           "freq_physical_distress", "freq_mental_distress",
           "race", "female", "somecollege",
           "nevermarried", "employment", "uninsured", "religion",
           "urban", "risk_from_smoking", "polarized_aggregate"),
         bounds = mybounds %>% select(-term) %>% as.matrix(), 
         max.resample = 1000) %>%
  saveRDS("dataset_mi_ment.rds")

```


# 4. Analysis

## 4.1 Negative Binomial Models

```{r, message = FALSE, warning = FALSE}
# Load packages
library(tidyverse)
#library(mfx)
library(broom)
library(car)
library(texreg)
library(Zelig)
select <- dplyr::select


# Write basic model formulas
# Effect of State level polarization on physical health
phys_state = days_poor_physical_health ~ diff_self_state_voter_avg +
  bmi + party_7 + age + race + income2 + female + somecollege +
  nevermarried + employment + uninsured + religion +
  risk_from_smoking
# Effect of US level polarization on physical health
phys_us = days_poor_physical_health ~ diff_self_us_voter_avg +
  bmi + party_7 + age + race + income2 + female + somecollege +
  nevermarried + employment + uninsured + religion +
  risk_from_smoking
# Effect of State level polarization on mental health
ment_state = days_poor_mental_health ~ diff_self_state_voter_avg +
  bmi + party_7 + age + race + income2 + female + somecollege +
  nevermarried + employment + uninsured + religion +
  risk_from_smoking
# Effect of US level polarization on mental health
ment_us = days_poor_mental_health ~ diff_self_us_voter_avg +
  bmi + party_7 + age + race + income2 + female + somecollege +
  nevermarried + employment + uninsured + religion +
  risk_from_smoking

# Run Negative Binomial models and Zero Inflated Negative Binomial Models

# Import dataset
dat <- read_rds("dataset_mi.rds")

m1 <- dat %>% zelig(formula = phys_state, model = "negbin")
m2 <- dat %>% zelig(formula = phys_us, model = "negbin")
m3 <- dat %>% zelig(formula = ment_state, model = "negbin")
m4 <- dat %>% zelig(formula = ment_us, model = "negbin")

# Write new function for VIF
get_vif = function(model){
  myvif <- model %>%
    car::vif()
  if(is.matrix(myvif)){
    # Square the GVIF^(1/(2*Df)) to make scores for categorical and numeric variables
    # comparable on the original scale, where 2.5 is great and 10 is bad
    myvif[,3]^2 %>%
      mean() %>%
      return()
  }else{
    myvif %>%
      mean() %>%
      return()
  }
}


get_poisson= function(mymodel){
  mymodel$model %>% rename(outcome = 1) %>% 
    glm(formula = outcome ~ ., family = "poisson") %>%
    return()
}

get_intercept = function(model){
  
  mystat <- lmtest::lrtest(get_poisson(model), model) %>%
    as.data.frame() %>%
    slice(2)
  
  data.frame(rawstat = mystat$Chisq,
             cleanstat = paste(mystat$Chisq %>% round(0), 
                               gtools::stars.pval(mystat$`Pr(>Chisq)`), 
                               " (",mystat$`#Df`, ")", sep = "")) %>%
    return()
}

# Generate the table
texreg::htmlreg(
  # Drawing from these models
  list(m1,m2,m3,m4), 
  # Saved here
  file = "viz/table_b1.html",
  caption.above = TRUE, single.row = TRUE,
  caption = "<b>Negative Binomial Models of Days of Poor Physical or Mental Health Reported by Respondents (n = 2752)</b><br><i>Coefficients as Log-Odds with Standard Errors in parentheses and Multiple Imputation (n = 5)</i>",
  bold = 0.10,
  stars = c(0.001, 0.01, 0.05, 0.10),
  # With these titles
  custom.header = list("Days of Poor Physical Health" = 1:2, 
                       "Days of Poor Mental Health" = 3:4),
  custom.model.names = c("US Level<br>Polarization", 
                         "State Level<br>Polarization",
                         "US Level<br>Polarization", 
                         "State Level<br>Polarization"),
  # These coefficient names
  custom.coef.map = list(
    "diff_self_state_voter_avg" = "Polarization (<i>compared to Median State Voter</i>)",
    "diff_self_us_voter_avg" = "Polarization (<i>compared to Median US Voter</i>)",
    # Covariates
    "bmi" = "Body Mass Index",
    "risk_from_smokingyes" = "At Risk from Smoking",
    "uninsuredYes" = "Uninsured",
    "party_7" = "Party Identification (7pt) (Dem-Rep)",
    "age" = "Age",
    "femaleYes" = "Female",
    "income2" = "Income",
    "raceaa" = "Black", 
    "racehispanic" = "Hispanic", 
    "raceasian" = "Asian", 
    "raceother" = "Other race", 
    "somecollegeYes" = "At least some college", 
    "nevermarriedyes" = "Never Married", 
    "employmentnot in labor force" = "Not in Labor Force", 
    "employmentunemployed" = "Unemployed", 
    "religionProtestant" = "Protestant",
    "religionCatholic" = "Catholic", 
    "religionAnother type of Christian" = "Other Christian",
    "religionJewish" = "Jewish", 
    "religionMuslim" = "Muslim",
    "religionNo religion" = "No religion",  
    "religionSome other religion" = "Other religion",
    "(Intercept)" = "Constant"),  
  groups = list(
    "<b>Perceived Polarization</b>" = 1:2,
    "<b>Health Conditions</b>" = 3:4,
    "<b>Health Insurace</b>" = 5,
    "<b>Partisanship</b>" = 6,
    "<b>Basic Demographics</b>" = 7:14,
    "<b>Extended Demographics</b>" = 15:23),
  # And these custom Goodness of Fit Tests
  custom.gof.rows = list(
    "Nagelkerke's R2" = list(m1,m2,m3,m4) %>%
      map(~from_zelig_model(.) %>%
            map(~performance::r2_nagelkerke(.) %>%
                  unlist() %>% unname() %>% round(2)) %>%
            unlist() %>%
            min()),
    "Mean VIF" = list(m1,m2,m3,m4) %>%
      map(~from_zelig_model(.) %>%
            map(~get_vif(.)) %>%
            unlist() %>%
            max() %>%
            round(2)),
    "Chi-squared (df)" = list(m1,m2,m3,m4) %>%
  map(~from_zelig_model(.) %>%
        map_dfr(~get_intercept(.)) %>%
                  filter(rawstat == min(rawstat)) %>%
                  select(cleanstat) %>%
        mutate(cleanstat = as.character(cleanstat)) %>%
                  unlist() %>%
                  unname(), .id = "model")
  ),
  custom.note = "*** signifies p < 0.001, ** signifies p < 0.01, * signifies p < 0.05, and . signifies p < 0.10 (two-tailed tests).<br><br>Note: Goodness of fit test statistics reflect the worst score among models from 5 imputed datasets, including the highest max VIF, the lowest Psuedo-R2, and the lowest Chi-squared statistic among imputations.",
  include.nobs = FALSE
)

```

## 4.2 Zero-inflated 

```{r}
# Import mean/mode imputed data
dat <- read_rds("dataset_mi.rds")
library(Amelia)
library(tidyverse)


# Write a quick function to meld estimates across imputations
get_meld = function(x){
  mi.meld(
    q = x$estimate %>% as.matrix(),
    se = x$std.error %>% as.matrix(),
    byrow = TRUE) %>% 
    as.data.frame() %>%
    rename(estimate = q.mi, std.error = se.mi) %>%
    # Backwards compute the p-value 
    # https://www.bmj.com/content/343/bmj.d2304
    mutate(statistic = estimate / std.error) %>%
    mutate(p.value = exp(-0.717*statistic - 0.416*statistic^2)) %>%
    return()
}

# Run models on each imputation
m1mi <- dat$imputations %>%
  map(~ pscl::zeroinfl(formula = phys_state, dist = "negbin", data =.)) 

m2mi <- dat$imputations %>%
  map(~ pscl::zeroinfl(formula = phys_us, dist = "negbin", data =.))

m3mi <- dat$imputations %>%
  map(~ pscl::zeroinfl(formula = ment_state, dist = "negbin", data =.)) 


m4mi <- dat$imputations %>%
  map(~ pscl::zeroinfl(formula = ment_us, dist = "negbin", data =.)) 


# Get coefficient table from a zero-inflated model
zerocoeftable = function(model){
  
  bind_rows(
    
    summary(model)$coefficients["count"]  %>% 
      as.data.frame() %>%
      tibble::rownames_to_column(var = "term") %>%
      mutate(term = paste("count", term, sep = "_")) %>%
      rename(estimate = 2, std.error = 3, statistic = 4, p.value = 5),
    
    summary(model)$coefficients["zero"]  %>% 
      as.data.frame() %>%
      tibble::rownames_to_column(var = "term") %>%
      mutate(term = paste("zero", term, sep = "_")) %>%
      rename(estimate = 2, std.error = 3, statistic = 4, p.value = 5)
  ) %>%
    return()
}

# Test run here
m1mi[[1]] %>% 
  zerocoeftable() %>%
  select(term) %>%
  left_join(by = "term",
            y = m1mi %>% 
              map_dfr(~zerocoeftable(.), .id = "imp") %>%
              ungroup() %>%
              split(.$term) %>%
              map_dfr(~get_meld(.),.id = "term") %>%
              mutate(model = 3))


# Grab a set of coefficinets
coefficients <- bind_rows(
  m1mi[[1]] %>% zerocoeftable() %>%
    select(term) %>%
    left_join(by = "term",
              y = m1mi %>% 
                map_dfr(~zerocoeftable(.), .id = "imp") %>%
                ungroup() %>%
                split(.$term) %>%
                map_dfr(~get_meld(.),.id = "term") %>%
                mutate(model = 1)),
  
 m2mi[[1]] %>% zerocoeftable() %>%
    select(term) %>%
   left_join(by = "term",
              y = m2mi %>% 
                map_dfr(~zerocoeftable(.), .id = "imp") %>%
                ungroup() %>%
                split(.$term) %>%
                map_dfr(~get_meld(.),.id = "term") %>%
                mutate(model = 2)),
  
  m3mi[[1]] %>% zerocoeftable() %>%
    select(term) %>%
  left_join(by = "term",
              y = m3mi %>% 
                map_dfr(~zerocoeftable(.), .id = "imp") %>%
                ungroup() %>%
                split(.$term) %>%
                map_dfr(~get_meld(.),.id = "term") %>%
                mutate(model = 3)),
  
  m4mi[[1]] %>% zerocoeftable() %>%
   select(term) %>%
   left_join(by = "term",
             y = m4mi %>% 
               map_dfr(~zerocoeftable(.), .id = "imp") %>%
               ungroup() %>%
               split(.$term) %>%
               map_dfr(~get_meld(.),.id = "term") %>%
               mutate(model = 4))
)

# Extract relevant components from this
beta <- coefficients %>%
  split(.$model) %>%
  map(~.$estimate %>% unlist())

se <- coefficients %>%
  split(.$model) %>%
  map(~.$std.error %>% unlist())

p <- coefficients %>%
  split(.$model) %>%
  map(~.$p.value %>% unlist())


# Define variable names
usual_count = list(
  "Count model: diff_self_state_voter_avg" = "Polarization (compared to Average State Voter)",
  "Count model: diff_self_us_voter_avg" = "Polarization (compared to Average US Voter)",
  # Covariates
  
  "Count model: bmi" = "Body Mass Index",
  "Count model: risk_from_smokingyes" = "At Risk from Smoking",
    "Count model: uninsuredYes" = "Uninsured",
  "Count model: party_7" = "Party Identification (7pt) (Dem-Rep)",
  "Count model: age" = "Age",
  "Count model: femaleYes" = "Female",
  "Count model: income2" = "Income", 
  "Count model: raceaa" = "Black", 
  "Count model: racehispanic" = "Hispanic", 
  "Count model: raceasian" = "Asian", 
  "Count model: raceother" = "Other race", 
  

  "Count model: somecollegeYes" = "At least some college", 
  "Count model: nevermarriedyes" = "Never Married", 
  "Count model: employmentnot in labor force" = "Not in Labor Force", 
  "Count model: employmentunemployed" = "Unemployed", 
  "Count model: religionProtestant" = "Protestant",
  "Count model: religionCatholic" = "Catholic", 
  "Count model: religionAnother type of Christian" = "Other Christian",
  "Count model: religionJewish" = "Jewish", 
  "Count model: religionMuslim" = "Muslim",
  "Count model: religionNo religion" = "No religion",  
  "Count model: religionSome other religion" = "Other religion",
  "Count model: (Intercept)" = "Constant"
)



usual_zero = list(
  "Zero model: diff_self_state_voter_avg" = "Polarization (compared to Average State Voter)",
  "Zero model: diff_self_us_voter_avg" = "Polarization (compared to Average US Voter)",
  # Covariates
  "Zero model: bmi" = "Body Mass Index",  
  "Zero model: risk_from_smokingyes" = "At Risk from Smoking",
    "Zero model: uninsuredYes" = "Uninsured",
  "Zero model: party_7" = "Party Identification (7pt) (Dem-Rep)",
  "Zero model: age" = "Age",
  "Zero model: femaleYes" = "Female",
  "Zero model: income2" = "Income", 
  "Zero model: raceaa" = "Black", 
  "Zero model: racehispanic" = "Hispanic", 
  "Zero model: raceasian" = "Asian", 
  "Zero model: raceother" = "Other race", 
  

  "Zero model: somecollegeYes" = "At least some college", 
  "Zero model: nevermarriedyes" = "Never Married", 
  "Zero model: employmentnot in labor force" = "Not in Labor Force", 
  "Zero model: employmentunemployed" = "Unemployed", 
  "Zero model: religionProtestant" = "Protestant",
  "Zero model: religionCatholic" = "Catholic", 
  "Zero model: religionAnother type of Christian" = "Other Christian",
  "Zero model: religionJewish" = "Jewish", 
  "Zero model: religionMuslim" = "Muslim",
  "Zero model: religionNo religion" = "No religion",  
  "Zero model: religionSome other religion" = "Other religion",
  "Zero model: (Intercept)" = "Constant")


# Save the set of objects into one list
modelgroup =  list(m1mi[[1]],
                   m2mi[[1]],
                   m3mi[[1]],
                   m4mi[[1]])

# Generate the table
texreg::htmlreg(
  # Drawing from these models
   modelgroup, 
   # Saved here
  file = "viz/table_b2.html",
  # With these titles
  custom.header = list("Days of Poor Physical Health" = 1:2, 
                    "Days of Poor Mental Health" = 3:4),
  custom.model.names = c("US Level<br>Polarization",
                         "State Level<br>Polarization",
                         "US Level<br>Polarization",
                         "State Level<br>Polarization"),
  # These coefficient names
  custom.coef.map = usual_count,  
  # Override coefficients
  override.coef = beta,
  override.se = se,
  override.pvalues = p,
  bold = 0.10,
  stars = c(0.001, 0.01, 0.05, 0.10),
    groups = list(
    "<b>Perceived Polarization</b>" = 1:2,
    "<b>Health Conditions</b>" = 3:4,
    "<b>Health Insurance</b>" = 5,
    "<b>Partisanship</b>" = 6,
    "<b>Basic Demographics</b>" = 7:14,
    "<b>Extended Demographics</b>" = 15:24),
  caption.above = TRUE,
  caption = "<b>Zero-Inflated Negative Binomial Models of Days of Poor Health Reported by Respondents (n = 2752)</b><br><i>Coefficients as Log-Odds with Standard Errors in Parentheses and Multiple Imputation (n = 5)</i><br><b>Count Model Effects</b>",
  custom.note = "*** signifies p < 0.001, ** signifies p < 0.01, * signifies p < 0.05, and . signifies p < 0.10.",
  single.row = TRUE,
  custom.gof.rows = list(
    "Nagelkerke's R2" = list(m1mi,m2mi,m3mi,m4mi) %>%
      map(. %>% 
            map(~pscl::pR2(.)["r2CU"] %>% round(2)) %>%
            unlist() %>%
            min()) %>%
      unlist()
  )
)


# Zero Model
# Generate the table

# Generate the table
texreg::htmlreg(
  # Drawing from these models
   modelgroup, 
   # Saved here
  file = "viz/table_b3.html",
  # With these titles
  custom.header = list("Days of Poor Physical Health" = 1:2, 
                    "Days of Poor Mental Health" = 3:4),
  custom.model.names = c("US Level<br>Polarization",
                         "State Level<br>Polarization",
                         "US Level<br>Polarization",
                         "State Level<br>Polarization"),
  # These coefficient names
  custom.coef.map = usual_zero,  
  # Override coefficients
  override.coef = beta,
  override.se = se,
  override.pvalues = p,
  bold = 0.10,
  stars = c(0.001, 0.01, 0.05, 0.10),
    groups = list(
    "<b>Perceived Polarization</b>" = 1:2,
    "<b>Health Conditions</b>" = 3:4,
    "<b>Health Insurance</b>" = 5,
    "<b>Partisanship</b>" = 6,
    "<b>Basic Demographics</b>" = 7:14,
    "<b>Extended Demographics</b>" = 15:24),
  caption.above = TRUE,
  caption = "<b>Zero-Inflated Negative Binomial Models of Days of Poor Health Reported by Respondents (n = 2752)</b><br><i>Coefficients as Log-Odds with Standard Errors in Parentheses and Multiple Imputation (n = 5)</i><br><b>Zero Model Effects</b>",
  custom.note = "*** signifies p < 0.001, ** signifies p < 0.01, * signifies p < 0.05, and . signifies p < 0.10.",
  single.row = TRUE,
  custom.gof.rows = list(
    "Nagelkerke's R2" = list(m1mi,m2mi,m3mi,m4mi) %>%
      map(. %>% 
            map(~pscl::pR2(.)["r2CU"] %>% round(2)) %>%
            unlist() %>%
            min()) %>%
      unlist()
  )
)

# Get rid of unnecessary data
rm(list = ls())
```


## 4.3 Marginal Effects


```{r}

get_fd = function(mysimulation){
  mysimulation %>%
    with(sim.out$x1$fd) %>%
    unlist() %>%
    data.frame(fd = .) %>%
    return()
  
}
mysim <- list(
  read_rds("dataset_mi_phys.rds") %>%
    zelig(formula = days_poor_physical_health ~ diff_self_us_voter_avg + 
            bmi + risk_from_smoking, model = "negbin") %>%
    sim(., x = setx(., diff_self_us_voter_avg = 0),
        x1 = setx1(.,  diff_self_us_voter_avg = 1)),
  read_rds("dataset_mi_phys.rds") %>%
    zelig(formula = days_poor_physical_health ~ diff_self_state_voter_avg + 
            bmi + risk_from_smoking, model = "negbin") %>%
    sim(., x = setx(., diff_self_state_voter_avg = 0),
        x1 = setx1(.,  diff_self_state_voter_avg = 1)),
  read_rds("dataset_mi_ment.rds") %>%
    zelig(formula = days_poor_mental_health ~ diff_self_us_voter_avg + 
            bmi + risk_from_smoking, model = "negbin") %>%
    sim(., x = setx(., diff_self_us_voter_avg = 0),
        x1 = setx1(.,  diff_self_us_voter_avg = 1)),
  read_rds("dataset_mi_ment.rds") %>%
    zelig(formula = days_poor_mental_health ~ diff_self_state_voter_avg + 
          bmi + risk_from_smoking, model = "negbin") %>%
    sim(., x = setx(., diff_self_state_voter_avg = 0),
        x1 = setx1(.,  diff_self_state_voter_avg = 1))
) %>%
  map_dfr(~get_fd(.), .id = "model") %>%
  mutate(outcome = case_when(
    model == 1 | model == 2 ~ "Physical Health",
    model == 3 | model == 4 ~ "Mental Health"),
    type = case_when(
      model %in% c(1,3) ~ "US Level\nPerceived Polarization",
      model %in% c(2,4) ~ "State Level\nPerceived Polarization") %>%
      factor(levels = c("US Level\nPerceived Polarization",
                        "State Level\nPerceived Polarization"))) %>%
  group_by(type, outcome) %>%
  filter(fd > quantile(fd, probs = 0.025),
         fd < quantile(fd, probs = 0.975)) %>%
  mutate(group = case_when(
    model == 2 ~ "p < 0.05",
    model != 2 ~ "insignificant") %>%
      factor(levels = c("p < 0.05", "insignificant"))) 

mysum <- mysim %>%
  group_by(type, outcome, group) %>%
  summarize(fd = median(fd), 
            label = round(median(fd), 2))

mysim %>%              
  ggplot(mapping = aes(x = type, y = fd,
                       color = group)) +
  geom_violin(color = "grey", size = 3) +
  geom_violin(draw_quantiles = 0.5) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
  geom_label(data = mysum, mapping = aes(label = label),color = "black") +
  facet_wrap(~outcome, ncol = 2) +
  coord_flip() +
  theme_classic(base_size = 14) +
  theme(panel.border = element_rect(fill = NA, color = "black")) +
  scale_color_manual(values = c( "black", "darkgrey")) +
  theme(legend.position = "bottom") +
  labs(x = NULL, color = "Statistical Significance",
       y = "Change in Expected Days of Poor Health\n(with 95% confidence interval)") +
  ggsave("viz/fig_2.png", dpi = 500, width = 8, height = 3.5)
```





## 4.4 Main Effect

```{r}
library(tidyverse)
library(Zelig)

# Run models on each imputation
phys_state = days_poor_physical_health ~ diff_self_state_voter_avg +
  bmi + party_7 + age + race + income2 + female + somecollege +
  nevermarried + employment + uninsured + religion +
  risk_from_smoking

m1 <- read_rds("dataset_mi.rds") %>%
  zelig(formula = phys_state, model = "negbin") 

mysim <- m1 %>%
  setx(diff_self_state_voter_avg = seq(
    from = 0,  to = 10, length.out = 50)) %>%
  sim() %>%
  zelig_qi_to_df() %>%
  select(x = diff_self_state_voter_avg, 
         ev = expected_value)

out <- bind_rows(
  mysim %>%
    group_by(x) %>%
    summarize(median = median(ev),
              lower = quantile(ev, 0.0005),
              upper = quantile(ev, 0.9995),
              ci = "99_9"),
  
  mysim %>%
    group_by(x) %>%
    summarize(median = median(ev),
              lower = quantile(ev, 0.005),
              upper = quantile(ev, 0.995),
              ci = "99"),
  
  mysim %>%
    group_by(x) %>%
    summarize(median = median(ev),
              lower = quantile(ev, 0.025),
              upper = quantile(ev, 0.975),
              ci = "95"),
  
  mysim %>%
    group_by(x) %>%
    summarize(median = median(ev),
              lower = quantile(ev, 0.05),
              upper = quantile(ev, 0.95),
              ci = "90")
) %>%
  pivot_longer(cols = c(lower, upper),
               names_to = "type",
               values_to = "ev") %>%
  # arrange
  arrange(desc(x), ev) %>%
  # Now for each round of simulations
  group_by(x) %>%
  summarize(
    median = median[1],
    lower = c(ev[1], ev[2], ev[3], ev[4],
              ev[5], ev[6], ev[7]),
    upper = c(ev[2], ev[3], ev[4], ev[5],
              ev[6], ev[7], ev[8]),
    group = 1:7) %>%
  # label
  mutate(ci = group %>% recode_factor(
    "1" = "99.9%",
    "2" = "99%",
    "3" = "95%",
    "4" = "90%",
    "5" = "95%",
    "6" = "99%",
    "7" = "99.9%") %>%
      factor(levels = c("90%", "95%", "99%", "99.9%")))

out %>%
  ggplot(mapping = aes(x = x, y = median, 
                       ymin = lower, ymax = upper, 
                       fill = ci, group = group)) +
  geom_ribbon(color = "white") +
  geom_line(color = "white", linetype = "dashed", size = 1.25) +
  geom_label(data = out %>%
               filter(x %in% c(0, 2, 4, 6, 8, 10)),
             mapping = aes(label = round(median, 2)), 
             color = "black", fill = "white", size = 4) +
  scale_fill_grey(start = 0, end = 0.8) +
  theme_classic(base_size = 14) +
  theme(panel.border = element_rect(fill = NA, color = "black"),
        plot.caption = element_text(hjust = 0)) +
  scale_x_continuous(breaks = c(0, 2, 4, 6, 8, 10),
                     limits = c(-0.1, 10.1)) +
  labs(x = "Perceived State Level Polarization (0-10)\n(Perceived Difference from Average State Voter, from Least to Most)",
       y = "Expected Days of Poor Physical Health",
       fill = "Confidence\nInterval",
       caption = "Note: Dashed line portrays median expected days for count model, while bands portray\n90, 95, 99, and 99.9% confidence intervals from 1000 simulations, given a perceived\npolarization score increasing from 0 to 10. Labels indicate expected outcome when\npolarization equals 0 or 10.") +
  ggsave("viz/fig_4.png", dpi = 500, width = 8, height = 5)

```


## 4.5 Expected Outcome

```{r}
library(tidyverse)
library(Zelig)

# Run models on each imputation
phys_state = days_poor_physical_health ~ diff_self_state_voter_avg +
  bmi + party_7 + age + race + income2 + female + somecollege +
  nevermarried + employment + uninsured + religion +
  risk_from_smoking

m1 <- read_rds("dataset_mi_phys.rds") %>%
  zelig(formula = phys_state, model = "negbin") 

m2 <- read_rds("dataset_mi_phys_under_median.rds") %>%
  zelig(formula = phys_state, model = "negbin") 

m3 <- read_rds("dataset_mi_phys_over_median.rds") %>%
  zelig(formula = phys_state, model = "negbin") 


out <- bind_rows(
  m1 %>%
    setx(diff_self_state_voter_avg = 0:10) %>%
    sim() %>%
    zelig_qi_to_df() %>%
    select(x = diff_self_state_voter_avg, 
           ev = expected_value) %>%
    mutate(type = "Days of Poor Physical Health\nOverall") %>%
    group_by(type, x) %>%
    summarize(median = median(ev),
              lower = quantile(ev, 0.025),
              upper = quantile(ev, 0.975)),
  m2 %>%
    setx(diff_self_state_voter_avg = 0:10) %>%
    sim() %>%
    zelig_qi_to_df() %>%
    select(x = diff_self_state_voter_avg, 
           ev = expected_value) %>%
    mutate(type = "Days of Poor Physical Health\nBelow Median") %>%
    group_by(type, x) %>%
    summarize(median = median(ev),
              lower = quantile(ev, 0.025),
              upper = quantile(ev, 0.975)),
  
  m3 %>%
    setx(diff_self_state_voter_avg = 0:10) %>%
    sim() %>%
    zelig_qi_to_df() %>%
    select(x = diff_self_state_voter_avg, 
           ev = expected_value) %>%
    mutate(type = "Days of Poor Physical Health\nAbove Median") %>%
    group_by(type, x) %>%
    summarize(median = median(ev),
              lower = quantile(ev, 0.025),
              upper = quantile(ev, 0.975))
) %>%
    mutate(type = type %>% factor(
    levels = c("Days of Poor Physical Health\nBelow Median","Days of Poor Physical Health\nOverall", "Days of Poor Physical Health\nAbove Median"))) 

# Now get observed data points
dat <- read_rds("dataset_making_us_sick.rds")

points <- bind_rows(
  dat %>%
    filter(days_poor_physical_health > 0 ) %>%
    filter(days_poor_physical_health >= median(days_poor_physical_health)) %>%
    mutate(type = "Days of Poor Physical Health\nAbove Median") %>%
    select(type,days_poor_physical_health, diff_self_state_voter_avg),
  
  dat %>%
    filter(days_poor_physical_health > 0 ) %>%
    filter(days_poor_physical_health < median(days_poor_physical_health)) %>%
    mutate(type = "Days of Poor Physical Health\nBelow Median") %>%
    select(type,days_poor_physical_health, diff_self_state_voter_avg),
  
  dat %>%
    filter(days_poor_physical_health > 0 ) %>%
    mutate(type = "Days of Poor Physical Health\nOverall") %>%
    select(type,days_poor_physical_health, diff_self_state_voter_avg)
) %>%
  mutate(type = type %>% factor(
    levels = c("Days of Poor Physical Health\nBelow Median","Days of Poor Physical Health\nOverall", "Days of Poor Physical Health\nAbove Median"))) %>%
  rename(y = days_poor_physical_health,
         x = diff_self_state_voter_avg)

out %>%
  ggplot(mapping = aes(x = x, y = median, 
                       ymin = lower, ymax = upper, 
                       color = type)) +
  geom_jitter(data = points,
              mapping = aes(
                x = x, y = y,
                ymin = NA_real_,
                ymax = NA_real_),
              alpha = 0.25, width = 0.5, height = 0.5) +
  geom_ribbon(alpha = 0.75, fill = "grey", color = "black", size = 1) +
  geom_line(linetype = "dashed", color = "black") +
  geom_point(data = out %>%
               filter(x %in% c(0, 10)),
             mapping = aes(x = x, y = median, 
                           ymin = NA_real_, ymax = NA_real_), size = 5, color = "black") +
  geom_point(data = out %>%
               filter(x %in% c(0, 10)),
             mapping = aes(x = x, y = median, 
                           ymin = NA_real_, ymax = NA_real_), size = 3, color = "white") +
  
  geom_label(data = out %>%
               filter(x %in% c(0, 10)),
             mapping = aes(x = x, y = upper + 2, 
                           ymin = NA_real_, ymax = NA_real_,
                           label = round(median, 2)), 
             color = "black") +
  facet_grid(~type, scale = "free_y") +
  guides(color = FALSE) +
  scale_color_grey(start = 0.6, end = 0) +
  theme_classic(base_size = 14) +
  theme(panel.border = element_rect(fill = NA, color = "black"),
        plot.caption = element_text(hjust = 0)) +
  scale_x_continuous(breaks = c(0, 2, 4, 6, 8, 10),
                     limits = c(-1, 11)) +
  labs(x = "Perceived State Level Polarization (0-10)\n(Perceived Difference from Average State Voter, from Least to Most)",
       y = "Expected Days of Poor Physical Health",
       caption = "Note: Lines portray median expected days for count model, while bands portray 95% confidence intervals\nfrom 1000 simulations. Large points and labels reflect median expected value given a perceived\npolarization score of 0 vs. 10. Transparent points portray actual data, jittered 0.5 for clarity.") +
  ggsave("viz/fig_2.png", dpi = 500, width = 8, height = 5)

```


## 4.6 Conditions

```{r, message = FALSE, warning = FALSE}

library(tidyverse)
library(Zelig)

# Run models on each imputation
phys_state = days_poor_physical_health ~ diff_self_state_voter_avg +
  bmi + party_7 + age + race + income2 + female + somecollege +
  nevermarried + employment + uninsured + religion +
  risk_from_smoking

# Get basic model
m1 <- read_rds("dataset_mi_phys.rds") %>%
  zelig(formula = phys_state, model = "negbin") 

# Run simulations
m1 %>%
  setx(diff_self_state_voter_avg = seq(from = 0, to = 10, length.out = 20),
       bmi = c(20, 25, 30),
       age = c(20, 40, 60, 80)) %>%
  sim() %>%
  zelig_qi_to_df() %>%
  select(ev = expected_value,
         bmi, age, x = diff_self_state_voter_avg) %>%
  saveRDS("ev.rds")

# 
mysim <- read_rds("ev.rds") %>%
  group_by(bmi, age, x) %>%
  summarize(median = median(ev),
            lower = quantile(ev, 0.025),
            upper = quantile(ev, 0.975)) %>%
  ungroup() %>%
  mutate(group = paste(age, bmi)) %>%
  filter(bmi %in% c(20, 30)) %>%
  mutate(bmi = bmi %>% recode_factor(
    "20" = "Very Healthy BMI (20)",
    "30" = "Obese BMI (30)"),
    age = age %>% recode_factor(
      "20" = "Age 20",
      "40" = "Age 40",
      "60" = "Age 60",
      "80" = "Age 80")) %>%
  mutate(label = if_else(x %in% c(0, 10),
                         true = round(median, 2),
                         false = NA_real_))

mysim %>%
  ggplot(mapping = aes(x = x, y = median, ymin = lower, ymax = upper,
                       color = bmi, fill = factor(bmi), group = group, label = label)) +
  geom_ribbon(alpha = 0.6, color = "black") +
  geom_label(fill = "white") +
  facet_grid(~age) +
  scale_fill_grey(start = 0, end = 0.9) +
  scale_color_grey(start = 0, end = 0.5) +
  theme_classic(base_size = 14) +
  theme(panel.border = element_rect(fill = NA, color = "black"),
        panel.spacing = unit(0.5, "cm"),
        plot.caption = element_text(hjust = 0),
        panel.grid.minor = element_blank(),
        legend.position = "bottom") +
  scale_x_continuous(breaks = c(0,2,4,6,8,10),
                     limits = c(-1, 11)) +
   labs(x = "Perceived State Level Polarization (0-10)\n(Perceived Difference from Average State Voter, from Least to Most)",
       y = "Expected Days\nof Poor Physical Health",
       caption = "Note: Bands reflect 95% confidence interval of expected days of poor physical health for count model,\nbased on 1000 simulations, as perceived polarization score ranges from 0 to 10. Labels indicated median\nexpected values at 0 and 10. Projections broken apart by different age groups and BMI levels.",
       fill = "Body Mass Index") +
  guides(color = FALSE) +
  ggsave("viz/fig_3.png", dpi = 500, width = 8.1, height = 5)

```


# 5. Descriptives

```{r}

# Get table of numeric variables
dat <- read_rds("dataset.rds")  %>%
   select(id, "days_poor_physical_health", "days_poor_mental_health",
                        "diff_self_us_voter_avg", "diff_self_state_voter_avg",
                        "bmi", "party_7", "age", "income2") %>%
  pivot_longer(cols = -c(id), names_to = "measure", values_to = "value") %>%
  
  mutate(measure = measure %>% recode_factor(
    "days_poor_physical_health" = "Days of Poor\nPhysical Health",
    "days_poor_mental_health" = "Days of Poor\nMental Health",
    "diff_self_us_voter_avg" = "US Level\nPolarization",
    "diff_self_state_voter_avg" = "State Level\nPolarization",
    # Covariates
    "bmi" = "Body Mass Index",
    "party_7" = "Party Identification\n(7pt) (Dem-Rep)",
    "age" = "Age",
    "income2" = "Income")) %>%
  group_by(measure) %>%
  summarize(
    `Mean` = mean(value, na.rm = TRUE) %>% round(2),
    `Median` = median(value, na.rm = TRUE) %>% round(2),
    `Std. Dev.` = sd(value, na.rm = TRUE) %>% round(2),
    `Min` = min(value, na.rm = TRUE) %>% round(2),
    `Max` = max(value, na.rm = TRUE) %>% round(2),
    `Obs.` = sum(!is.na(value)) %>% round(0),
    `% Missing` = round(sum(is.na(value)) / n() * 100, 1)  ) %>%
  ungroup() %>%
  mutate_at(vars(`Mean`:`% Missing`), funs(as.character(.))) %>%
  pivot_longer(cols = -c(measure), names_to = "type", values_to = "stat") %>%
  mutate(type = factor(type, levels = c("Mean", "Median", "Std. Dev.", 
                                        "Min", "Max", "Obs.", "% Missing")))

g1 <- dat %>%
  mutate(order = as.numeric(measure)) %>%
  ggplot(mapping = aes(x = "", y = reorder(measure, -order), label = stat)) +
  geom_tile(fill = "white", color = "darkgrey") +
  geom_text(color = "black") +
  scale_x_discrete(position = "top") +
  theme_classic(base_size = 14) +
  theme(panel.border = element_rect(fill = NA, color = "black"),
        axis.ticks.x = element_blank())  +
  facet_grid(~type, scales = "free") +
  labs(x = "Descriptive Statistics (Continuous Variables)",
       y = NULL) 
#    ggsave("viz/table_a4.png", dpi = 500, width = 8.5, height = 6)


# Repeat for Categorical Variables
dat <- read_rds("dataset.rds") %>%
   select(id, days_poor_physical_health, days_poor_mental_health,
         race, female, somecollege,
         nevermarried, employment, uninsured, religion,
         risk_from_smoking) %>%
  mutate(freq_physical_distress = if_else(days_poor_physical_health >= 14, 1, 0),
         freq_mental_distress = if_else(days_poor_mental_health >= 14, 1, 0)) %>%
  select(-days_poor_physical_health, -days_poor_mental_health) %>%
  fastDummies::dummy_cols(select_columns = names(.)[-1],
                          ignore_na = TRUE,remove_selected_columns = TRUE)  %>%
  pivot_longer(cols = -c(id), names_to = "measure", values_to = "value") %>%
  filter(measure %in% c("female_Yes",
                        "race_aa","race_white", "race_hispanic", "race_asian", "race_other",
                        "somecollege_Yes", "nevermarried_yes", "employment_not in labor force",
                        "employment_unemployed", "employment_employed",
                        "religion_Protestant", "religion_Catholic", 
                        "religion_Another type of Christian","religion_Jewish",
                        "religion_Muslim", "religion_No religion",
                   "religionSome other religion",
                   "uninsured_Yes",
                   "risk_from_smoking_yes")) %>%
  mutate(measure = measure %>% recode_factor(
  "female_Yes" = "Female",
  "race_white" = "White",
  "race_aa" = "Black", 
  "race_hispanic" = "Hispanic", 
  "race_asian" = "Asian", 
  "race_other" = "Other race", 
  "somecollege_Yes" = "At least some college", 
  "nevermarried_yes" = "Never Married", 
  "employment_not in labor force" = "Not in Labor Force", 
  "employment_unemployed" = "Unemployed", 
  "employment_employed" = "Employed",
  "religion_Protestant" = "Protestant",
  "religion_Catholic" = "Catholic", 
  "religion_Another type of Christian" = "Other Christian",
  "religion_Jewish" = "Jewish", 
  "religion_Muslim" = "Muslim",
  "religion_No religion" = "No religion",  
  "religion_Some other religion" = "Other religion",
  "uninsured_Yes" = "Uninsured",
  "risk_from_smoking_yes" = "Risk from Smoking")) %>%
  group_by(measure) %>%
  summarize(
    `Count` = sum(value == 1, na.rm = TRUE) %>% round(0),
    `% Frequency` = round(sum(value == 1, na.rm = TRUE) / n() * 100, 1),
    `Obs.` = sum(!is.na(value)) %>% round(0),
    `% Missing` = round(sum(is.na(value)) / n() * 100, 1)  ) %>%
  ungroup() %>%
  mutate_at(vars(`Count`:`% Missing`), funs(as.character(.))) %>%
  pivot_longer(cols = -c(measure), names_to = "type", values_to = "stat") %>%
  mutate(type = factor(type, levels = c("Count", "% Frequency",
                                       "Obs.", "% Missing")))


g2 <- dat %>%
  mutate(order = as.numeric(measure)) %>%
  ggplot(mapping = aes(x = "", y = reorder(measure, -order), label = stat)) +
  geom_tile(fill = "white", color = "darkgrey") +
  geom_text(color = "black") +
  scale_x_discrete(position = "top") +
  theme_classic(base_size = 14) +
  theme(panel.border = element_rect(fill = NA, color = "black"),
        axis.ticks.x = element_blank())  +
  facet_grid(~type, scales = "free") +
  labs(x = "Descriptive Statistics (Categorical Variables)",
       y = NULL) 


ggpubr::ggarrange(g1,g2, ncol = 1, heights = c(4, 4.5)) +
  ggsave("viz/table_a4.png", dpi = 500, width = 8, height = 8)

remove(dat, g1,g2)
```
